STATISTICAL LEARNING INDIVIDUAL PROJECT

Master’s Degree in Data Science for Economics

Università degli Studi di Milano

Author: Guglielmo Berzano

October 2023
library(tidyverse) #increases R dictionary
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(magrittr) #increases R dictionary
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.3
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.2.3
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.2.3
library(corrplot) #creation of the correlation plot
## Warning: package 'corrplot' was built under R version 4.2.3
## corrplot 0.92 loaded
library(car)
## Warning: package 'car' was built under R version 4.2.3
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(leaps) #best subset
## Warning: package 'leaps' was built under R version 4.2.3
library(robustbase) #robust regression
## Warning: package 'robustbase' was built under R version 4.2.3
library(olsrr) #chart for residuals
## Warning: package 'olsrr' was built under R version 4.2.3
## 
## Attaching package: 'olsrr'
## 
## The following object is masked from 'package:datasets':
## 
##     rivers
library(rpart) #regression trees
## Warning: package 'rpart' was built under R version 4.2.3
library(rpart.plot) #plotting regression trees
## Warning: package 'rpart.plot' was built under R version 4.2.3
library(randomForest) #random forest algorithm
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(ggfortify) #pca plotting
## Warning: package 'ggfortify' was built under R version 4.2.3
library(factoextra)#pca plotting
## Warning: package 'factoextra' was built under R version 4.2.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(rgl) #3d plots for PCA, just for fun
## Warning: package 'rgl' was built under R version 4.2.3
me_cars<- read.csv("https://raw.githubusercontent.com/guber25/ArabicPeninsulaCars/main/Dataset/Car%20in%20the%20middle%20east.csv")
head(me_cars)
##   Engine.Capacity..liters. Cylinders        Drive.Type
## 1                      1.2         3 Front Wheel Drive
## 2                      1.2         3 Front Wheel Drive
## 3                      1.4         4 Front Wheel Drive
## 4                      1.6         4 Front Wheel Drive
## 5                      1.5         4 Front Wheel Drive
## 6                      1.4         4 Front Wheel Drive
##   Fuel.Tank.Capacity..liters. Fuel.Economy..L.100.Km. Fuel.Type
## 1                          42                     4.9    Petrol
## 2                          42                     4.9    Petrol
## 3                          45                     6.3    Petrol
## 4                          50                     6.4    Petrol
## 5                          48                     5.8    Petrol
## 6                          35                     5.1    Petrol
##   Horsepower..bhp. Torque..Nm. Transmission Top.Speed..Km.h. Seating.Capacity
## 1               76         100    Automatic              170         5 Seater
## 2               76         100    Automatic              170         5 Seater
## 3               75         118       Manual              156         2 Seater
## 4              102         145    Automatic              180         5 Seater
## 5              112         150    Automatic              170         5 Seater
## 6               98         127    Automatic              170         5 Seater
##   Acceleration.0.100.Km.h..sec. Length..meters. Width..meters. Height..meters.
## 1                          14.0           4.245          1.670           1.515
## 2                          14.0           4.245          1.670           1.515
## 3                          16.0           3.864          1.716           1.721
## 4                          11.0           4.354          1.994           1.529
## 5                          10.9           4.314          1.809           1.624
## 6                          12.0           3.636          1.597           1.483
##   Wheelbase..meters. Trunk.Capacity..liters.
## 1              2.550                     450
## 2              2.550                     450
## 3              2.513                    2800
## 4              2.635                     510
## 5              2.585                     448
## 6              2.385                     314
##                                     name  price currency
## 1 Mitsubishi Attrage 2021 1.2 GLX (Base) 34,099      SAR
## 2 Mitsubishi Attrage 2021 1.2 GLX (Base) 34,099      SAR
## 3        Fiat Fiorino 2021 1.4L Standard 41,250      SAR
## 4            Renault Symbol 2021 1.6L PE 44,930      SAR
## 5                    MG ZS 2021 1.5L STD 57,787      SAR
## 6           Chevrolet Spark 2021 1.4L LS 53,790      SAR

DATA PREPROCESSING

#Filtering out the null values
me_cars<-me_cars %>% filter(price != " TBD " &
                              Cylinders != "N/A" &
                              Trunk.Capacity..liters. != "N/A" &
                              Trunk.Capacity..liters. != "Null")


#Changing the type of some columns (from char to dbl) and removing strange values

me_cars$price <- gsub(",", "", me_cars$price) %>% as.numeric() %>% round(2)

me_cars$Cylinders <- as.numeric(me_cars$Cylinders)

me_cars$Trunk.Capacity..liters. <- gsub(" \\(.*\\)", "", me_cars$Trunk.Capacity..liters.) %>% as.numeric()
## Warning in gsub(" \\(.*\\)", "", me_cars$Trunk.Capacity..liters.) %>%
## as.numeric(): NAs introduced by coercion
me_cars$Fuel.Tank.Capacity..liters. <- gsub(" \\(.*\\)", "",me_cars$Fuel.Tank.Capacity..liters.) %>% as.numeric()

me_cars$Seating.Capacity <- gsub("Seater", "",me_cars$Seating.Capacity) %>% as.numeric()

me_cars$Wheelbase..meters.<- gsub("  ", "", me_cars$Wheelbase..meters.) %>% as.numeric() %>% round(2)

me_cars$Torque..Nm.  %<>%  as.numeric

me_cars$Length..meters. %<>% round(2)

me_cars$Width..meters. %<>% round(2)

me_cars$Height..meters. %<>% round(2)

#Changing the type of other columns (from char to fact)

me_cars$Drive.Type %<>% as.factor
me_cars$Fuel.Type %<>% as.factor
me_cars$Transmission %<>% as.factor
me_cars$currency %<>% as.factor

#Renaming some variables so that they are easier to understand

me_cars <- me_cars %>% rename("Engine_Capacity"="Engine.Capacity..liters.",
                              "Driving"="Drive.Type",
                              "Fuel_Capacity"="Fuel.Tank.Capacity..liters.",
                              "Liters_For_100km"="Fuel.Economy..L.100.Km.",
                              "Fuel_Type"="Fuel.Type",
                              "Horsepower"="Horsepower..bhp.",
                              "Torque"="Torque..Nm.",
                              "Top_Speed"="Top.Speed..Km.h.",
                              "Seating_Capacity"="Seating.Capacity",
                              "Acceleration_0100"="Acceleration.0.100.Km.h..sec.",
                              "Length"="Length..meters.",
                              "Width"="Width..meters.",
                              "Height"="Height..meters.",
                              "Wheelbase"="Wheelbase..meters.",
                              "Trunk_Capacity"="Trunk.Capacity..liters.",
                              "Price"="price",
                              "Currency"="currency",
                              "Name"="name")


#EXCHANGE RATES RETRIEVED ON JUNE, FRIDAY 23th
me_cars$PriceEURO<-ifelse(me_cars$Currency=="SAR", me_cars$Price*.25,
                          ifelse(me_cars$Currency=="AED", me_cars$Price*.25,
                                 ifelse(me_cars$Currency=="BHD", me_cars$Price*2.44,
                                        ifelse(me_cars$Currency=="KWD", me_cars$Price*2.99,
                                               ifelse(me_cars$Currency=="OMR", me_cars$Price*2.39, me_cars$Price*.25)))))

#Creating the column Area
me_cars$Area<-ifelse(me_cars$Currency=="SAR", "Saudi Arabia",
                          ifelse(me_cars$Currency=="AED", "UAE",
                                 ifelse(me_cars$Currency=="BHD", "Bahrain",
                                        ifelse(me_cars$Currency=="KWD", "Kuwait",
                                               ifelse(me_cars$Currency=="OMR", "Oman", "Qatar")))))
me_cars$Area %<>% as.factor
me_cars %>% head()
##   Engine_Capacity Cylinders           Driving Fuel_Capacity Liters_For_100km
## 1             1.2         3 Front Wheel Drive            42              4.9
## 2             1.2         3 Front Wheel Drive            42              4.9
## 3             1.4         4 Front Wheel Drive            45              6.3
## 4             1.6         4 Front Wheel Drive            50              6.4
## 5             1.5         4 Front Wheel Drive            48              5.8
## 6             1.4         4 Front Wheel Drive            35              5.1
##   Fuel_Type Horsepower Torque Transmission Top_Speed Seating_Capacity
## 1    Petrol         76    100    Automatic       170                5
## 2    Petrol         76    100    Automatic       170                5
## 3    Petrol         75    118       Manual       156                2
## 4    Petrol        102    145    Automatic       180                5
## 5    Petrol        112    150    Automatic       170                5
## 6    Petrol         98    127    Automatic       170                5
##   Acceleration_0100 Length Width Height Wheelbase Trunk_Capacity
## 1              14.0   4.24  1.67   1.51      2.55            450
## 2              14.0   4.24  1.67   1.51      2.55            450
## 3              16.0   3.86  1.72   1.72      2.51           2800
## 4              11.0   4.35  1.99   1.53      2.63            510
## 5              10.9   4.31  1.81   1.62      2.58            448
## 6              12.0   3.64  1.60   1.48      2.38            314
##                                     Name Price Currency PriceEURO         Area
## 1 Mitsubishi Attrage 2021 1.2 GLX (Base) 34099      SAR   8524.75 Saudi Arabia
## 2 Mitsubishi Attrage 2021 1.2 GLX (Base) 34099      SAR   8524.75 Saudi Arabia
## 3        Fiat Fiorino 2021 1.4L Standard 41250      SAR  10312.50 Saudi Arabia
## 4            Renault Symbol 2021 1.6L PE 44930      SAR  11232.50 Saudi Arabia
## 5                    MG ZS 2021 1.5L STD 57787      SAR  14446.75 Saudi Arabia
## 6           Chevrolet Spark 2021 1.4L LS 53790      SAR  13447.50 Saudi Arabia

Map for first page of the report

mec_map0 <- data.frame(region=c("Bahrain", "Kuwait", "Qatar", "Oman", "Saudi Arabia", "United Arab Emirates"),
                       column=c(seq(1,6,1)))

mapdata<-map_data("world")
mapdata <- left_join(mapdata, mec_map0, by="region") 
mapdata<-mapdata %>% filter(!is.na(mapdata$column))

ggplot(mapdata, aes(x=long, y=lat, group=group))+
  
  geom_polygon(aes(fill=region), col="black")+
  scale_fill_manual(name="Countries", breaks=c("Bahrain", "Kuwait", "Oman", "Qatar", "Saudi Arabia", "United Arab Emirates"),
    values=c("azure","skyblue", "deepskyblue", "dodgerblue", "blue3", "navy"))+

  
  theme(plot.title = element_text(face="bold"),
    axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        rect = element_blank())

DATA EXPLORATION

me_cars %>% select(-Driving,-Fuel_Type,-Transmission,-Name,-Currency,-Area,-Price) %>% gather() %>% 
  ggplot(aes(value)) + 
    geom_boxplot(fill="skyblue", color="black", outlier.colour = "deepskyblue") + 
    facet_wrap(~key, scales = 'free')+
  theme_minimal()
## Warning: Removed 2 rows containing non-finite values (`stat_boxplot()`).

It is clear that there are a lot of outliers. It is important to remove the most extreme ones. The criterion used to decide which is an outlier and which is not is just common sense, i.e. it is impossible that a car is 4500m long.

Outlier removal

value = me_cars[,"Height"][me_cars[,"Height"]>500]
me_cars[,"Height"][me_cars[,"Height"] %in% value] = NA
me_cars = drop_na(me_cars)

value = me_cars[,"Length"][me_cars[,"Length"]>500]
me_cars[,"Length"][me_cars[,"Length"] %in% value] = NA
me_cars = drop_na(me_cars)

value = me_cars[,"Wheelbase"][me_cars[,"Wheelbase"]>500]
me_cars[,"Wheelbase"][me_cars[,"Wheelbase"] %in% value] = NA
me_cars = drop_na(me_cars)

value = me_cars[,"Width"][me_cars[,"Width"]>500]
me_cars[,"Width"][me_cars[,"Width"] %in% value] = NA
me_cars = drop_na(me_cars)

value = me_cars[,"Horsepower"][me_cars[,"Horsepower"]>1250]
me_cars[,"Horsepower"][me_cars[,"Horsepower"] %in% value] = NA
me_cars = drop_na(me_cars)


value = me_cars[,"Trunk_Capacity"][me_cars[,"Trunk_Capacity"]>1300]
me_cars[,"Trunk_Capacity"][me_cars[,"Trunk_Capacity"] %in% value] = NA
me_cars = drop_na(me_cars)

value = me_cars[,"PriceEURO"][me_cars[,"PriceEURO"]>750000]
me_cars[,"PriceEURO"][me_cars[,"PriceEURO"] %in% value] = NA
me_cars = drop_na(me_cars)

#REMOVE DUPLICATES
me_cars %<>% distinct() 
me_cars %>% select(-Driving,-Fuel_Type,-Transmission,-Name,-Currency, -Price, -Area) %>% gather() %>% 
  ggplot(aes(value)) + 
geom_boxplot(fill="skyblue", color="black", outlier.colour = "deepskyblue") + 
    facet_wrap(~key, scales = 'free')+
  theme_minimal()

me_cars %>% select(-Driving,-Fuel_Type,-Transmission,-Name,-Currency, -Area, -Price) %>% gather() %>% 
  ggplot(aes(value)) +
geom_histogram(fill="skyblue", color="black") + 
    facet_wrap(~key, scales = 'free')+
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Now we have obtained a nice dataset to work with.

me_cars %>% select(-Driving,-Fuel_Type,-Transmission,-Name,-Currency,-Price, -Area) %>% cor() %>% corrplot(type = "full", 
         tl.col = "black", tl.srt = 45)

We see how many variables are quite correlated with each other. This correlation will be better analysed later.

We plot a chart of the correlation between numeric variables and PriceEURO

pr_cor<-me_cars %>% select(-Driving,-Fuel_Type,-Transmission,-Name,-Currency,-Price, -Area) %>% cor() %>% data.frame()
pr_cor<-pr_cor[-15,]
ggplot(pr_cor, aes(x=pr_cor %>% rownames(), y=PriceEURO))+
  geom_bar(fill=ifelse(pr_cor$PriceEURO>0,"deepskyblue","red") ,stat="identity")+
  labs(x="\nVariables",
       y="Correlation with PriceEURO")+
  guides(x =  guide_axis(angle = 45))+
  theme(axis.text = element_text(face="bold"),
        panel.background = element_rect(fill = "white",
                                colour = "white"),
        panel.grid.major = element_line(linewidth = 0.01, linetype = 'solid',
                                colour = "gray88"))

SUPERVISED LEARNING

Tests for normality and linear regression

me_cars %>% 
  ggplot(aes(PriceEURO)) +
  geom_density(linewidth=1)+
  theme_minimal()

Not normal at all. We try to take the log

me_cars %>% 
  ggplot(aes(log(PriceEURO))) +
  geom_density(linewidth=1)+
  stat_overlay_normal_density(color="red", linetype="dashed", linewidth=1)+
  theme_minimal()

Much better than before! Let’s try to make statistical tests on this.

To do so we are going to create a new dataset called “mec_stat”, meaning Middle East Cars Stat without some of the useless variable we had in the original dataset.

Then we create a ds for each of the 5 Areas, namely: - Bahrain (BR) - Kuwait (KW) - Oman (OM) - Qatar (QT) - Saudi Arabia (SA) - United Arab Emirates (UAE)

mec_stat<-me_cars %>% select(-Name,-Price, -Currency) #general ds for statistical analysis

#a bit naive but necessary
mec_statSA <- mec_stat %>% filter(Area=="Saudi Arabia") %>% select(-Area)
mec_statUAE <- mec_stat %>% filter(Area=="UAE") %>% select(-Area)
mec_statBR <- mec_stat %>% filter(Area=="Bahrain") %>% select(-Area)
mec_statOM <- mec_stat %>% filter(Area=="Oman") %>% select(-Area)
mec_statQT <- mec_stat %>% filter(Area=="Qatar") %>% select(-Area)
mec_statKW <- mec_stat %>% filter(Area=="Kuwait") %>% select(-Area)
mec_map1 <- data.frame(region=c("Bahrain", "Kuwait", "Qatar", "Oman", "Saudi Arabia", "United Arab Emirates"),
                       AvgPrice=c(mean((mec_statBR$PriceEURO)),
                                  mean((mec_statKW$PriceEURO)),
                                  mean((mec_statQT$PriceEURO)),
                                  mean((mec_statOM$PriceEURO)),
                                  mean((mec_statSA$PriceEURO)),
                                  mean((mec_statUAE$PriceEURO))))

mapdata<-map_data("world")
mapdata <- left_join(mapdata, mec_map1, by="region") 
mapdata<-mapdata %>% filter(!is.na(mapdata$AvgPrice))

ggplot(mapdata, aes(x=long, y=lat, group=group))+
  geom_polygon(aes(fill=AvgPrice), color="black")+
  scale_fill_gradient(name="Average car price in EURO\n", low="#CCFFFF", high="#000066",
                      #ticks for the legend
                      limit=c(min(mapdata$AvgPrice), max(mapdata$AvgPrice)),
                      breaks=seq(from=round(min(mapdata$AvgPrice)), 
                                 to=max(mapdata$AvgPrice),
                            by=round((max(mapdata$AvgPrice)-round(min(mapdata$AvgPrice)))/4,2)))+
  
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        rect = element_blank())

We now take the Shapiro-Wilk test to verify whether the distribution of the variables is normal or not.

cat("Bahrain")
## Bahrain
shapiro.test(log(mec_statBR$PriceEURO))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(mec_statBR$PriceEURO)
## W = 0.9905, p-value = 5.992e-06
cat("\nKuwait")
## 
## Kuwait
shapiro.test(log(mec_statKW$PriceEURO))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(mec_statKW$PriceEURO)
## W = 0.98714, p-value = 3.242e-05
cat("\nQatar")
## 
## Qatar
shapiro.test(log(mec_statQT$PriceEURO))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(mec_statQT$PriceEURO)
## W = 0.99397, p-value = 0.01684
cat("\nOman")
## 
## Oman
shapiro.test(log(mec_statOM$PriceEURO))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(mec_statOM$PriceEURO)
## W = 0.99664, p-value = 0.2442
cat("\nSaudi Arabia")
## 
## Saudi Arabia
shapiro.test(log(mec_statSA$PriceEURO))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(mec_statSA$PriceEURO)
## W = 0.97415, p-value = 1.969e-08
cat("\nUnited Arab Emirates")
## 
## United Arab Emirates
shapiro.test(log(mec_statUAE$PriceEURO))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(mec_statUAE$PriceEURO)
## W = 0.98677, p-value = 4.05e-06

We can see that, apart from Oman, all the other distributions are not-gaussian. So we check for the distribution over the residuals which, by the way, is the important distribution to check for making inference with ols.

model1BR=mec_statBR %>% lm(log(PriceEURO)~.,.)
cat("\nBahrain")
## 
## Bahrain
shapiro.test(model1BR$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model1BR$residuals
## W = 0.98647, p-value = 7.518e-08
model1KW=mec_statKW %>% lm(log(PriceEURO)~.,.)
cat("\nKuwait")
## 
## Kuwait
shapiro.test(model1KW$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model1KW$residuals
## W = 0.99327, p-value = 0.00766
model1OM=mec_statOM %>% lm(log(PriceEURO)~.,.)
cat("\nOman")
## 
## Oman
shapiro.test(model1OM$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model1OM$residuals
## W = 0.99217, p-value = 0.003011
model1QT=mec_statQT %>% lm(log(PriceEURO)~.,.)
cat("\nQatar")
## 
## Qatar
shapiro.test(model1QT$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model1QT$residuals
## W = 0.99132, p-value = 0.001321
model1SA=mec_statSA %>% lm(log(PriceEURO)~.,.)
cat("\nSaudi Arabia")
## 
## Saudi Arabia
shapiro.test(model1SA$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model1SA$residuals
## W = 0.9558, p-value = 5.581e-12
model1UAE=mec_statUAE %>% lm(log(PriceEURO)~.,.)
cat("\nUnited Arab Emirates")
## 
## United Arab Emirates
shapiro.test(model1UAE$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model1UAE$residuals
## W = 0.99684, p-value = 0.167

We see that Kuwait, Oman and UAE have normally distributed error while the other countries have not. In order to be consistent with the analysis I decided not to consider the linear regression since not all the results where unbiased.

Further analysis is needed

Now we initialize a function for computing the Vif (variance inflation factor) and for plotting its values.

# VIF FUNCTION

vif_func<-function(model_name, country){
  modelvif=data.frame(vif(model_name)) #we compute the vif of the model
  modelvif$row=row.names(modelvif) #we do some magic to let R know what we are working with
  colnames(modelvif)[1] = "Vif"
  
  
  #Plotting the Vif
  modelvif %>%
  ggplot(aes(row, Vif))+
  geom_bar(stat="identity")+
  geom_hline(yintercept = 5, linewidth=1, color="yellow")+
  geom_hline(yintercept = 7.5, linewidth=1, color="orange")+
  geom_hline(yintercept = 10, linewidth=1, color="red")+
  scale_y_continuous(limits = c(0,30), breaks = seq(0,30,2.5), minor_breaks = NULL)+
  labs(x=NULL, y=NULL, title = bquote(paste("Vif value per variable - Area: ",.(country))))+
  coord_flip()+
  theme_minimal()

}

We initialize also a function for obtaining the best subset possible.

#BEST SUBSET MODEL
best_sub<-function(ds,country){
  best_sub<-regsubsets(x= log(PriceEURO)~.,data = ds, nvmax = length(names(ds)), 
                       method = "forward") #we do the subset
  best_sub_summary<- best_sub %>% summary()
  best_sub_summary$cp %>% which.min()
  plot(best_sub, scale="Cp") #we plot the best one
  title(main=country)
}

We do again the regression for convenience.

model1BR=mec_statBR %>% lm(log(PriceEURO)~.,.,)
model1KW=mec_statKW %>% lm(log(PriceEURO)~.,.)
model1QT=mec_statQT %>% lm(log(PriceEURO)~.,.)
model1OM=mec_statOM %>% lm(log(PriceEURO)~.,.)
model1SA=mec_statSA %>% lm(log(PriceEURO)~.,.)
model1UAE=mec_statUAE %>% lm(log(PriceEURO)~.,.)

First we check the Vif for all the models and…

vif_func(model1BR,"Bahrain")

vif_func(model1KW,"Kuwait")

vif_func(model1QT,"Qatar")

vif_func(model1OM,"Oman")

vif_func(model1SA,"Saudi Arabia")

vif_func(model1UAE,"United Arab Emirates")

… we eliminate the most correlated variables:

mec_statBR <- mec_statBR %>% select(-Horsepower, -Engine_Capacity)
mec_statKW <- mec_statKW %>% select(-Horsepower, -Engine_Capacity)
mec_statQT <- mec_statQT %>% select(-Horsepower, -Engine_Capacity)
mec_statSA <- mec_statSA %>% select(-Horsepower, -Engine_Capacity)
mec_statOM <- mec_statOM %>% select(-Horsepower, -Engine_Capacity)
mec_statUAE <- mec_statUAE %>% select(-Length, -Horsepower,-Engine_Capacity)

Now, with the leftover variables, we compute the best possible subset for each model and…

mec_statBR %>% best_sub("Bahrain")

mec_statKW %>% best_sub("Kuwait")

mec_statQT %>% best_sub("Qatar")

mec_statOM %>% best_sub("Oman")

mec_statSA %>% best_sub("Saudi Arabia")

mec_statUAE %>% best_sub("UAE")

… again we remove the non-important ones and create some dummies if necessary. To be clear, the non-important variables are the ones that at the top of the chart have a white rectangle.

#Selecting only the important variables

#BAHRAIN
mec_statBR <- mec_statBR %>% select(-Transmission)

#KUWAIT
mec_statKW <- mec_statKW %>% select(-c(Acceleration_0100, Length))

#QATAR
mec_statQT$TransMan <-ifelse(mec_statQT$Transmission=="Manual",1,0)
mec_statQT <- mec_statQT %>% select(-Transmission, -Fuel_Type, -Length)

#OMAN
mec_statOM$FT_Hybrid <-ifelse(mec_statOM$Fuel_Type=="Hybrid",0,1)
mec_statOM <- mec_statOM %>% select(-Length, -Transmission, -Fuel_Type)

#Saudi Arabia
mec_statSA$DrivingFWD <-ifelse(mec_statSA$Driving=="Front Wheel Drive",1,0)
mec_statSA <- mec_statSA %>% select(-Driving, -Transmission, Width)

#United Arab Emirates
mec_statUAE$DrivingFWD <-ifelse(mec_statUAE$Driving=="Front Wheel Drive",1,0)
mec_statUAE$FT_Hybrid <-ifelse(mec_statUAE$Fuel_Type=="Hybrid",1,0)
mec_statUAE <- mec_statUAE %>% select(-Driving,-Fuel_Type,-Transmission)

We now retrain the linear model with the new datasets

model1BR=mec_statBR %>% lm(log(PriceEURO)~.,.,)
model1KW=mec_statKW %>% lm(log(PriceEURO)~.,.)
model1OM=mec_statOM %>% lm(log(PriceEURO)~.,.)
model1QT=mec_statQT %>% lm(log(PriceEURO)~.,.)
model1SA=mec_statSA %>% lm(log(PriceEURO)~.,.)
model1UAE=mec_statUAE %>% lm(log(PriceEURO)~.,.)

Check again the Vif and notice that things got way better.

vif_func(model1BR,"Bahrain")

vif_func(model1KW,"Kuwait")

vif_func(model1OM,"Oman")

vif_func(model1QT,"Qatar")

vif_func(model1SA,"Saudi Arabia")

vif_func(model1UAE,"UAE")

Now we have cleaned the dataset as much as possible. We don’t have variables that suffer from multicollinearity and we have selected the best subset of variables, this seems perfect!!

What about using the robust regression since…

ols_plot_resid_lev(model1KW)

… there are so many outliers?

Initializing a function to compute the robust regression. We are not scaling the variable PriceEURO and we are removing, if present, the variable DrivingFWD because it was giving problems.

rob_reg<-function(ds){
  set.seed(1)
  index <- sample(2, nrow(ds), prob = c(0.8, 0.2), replace = TRUE) 
  #so we give to each observation a prob of 80% of falling into the train and 20% of falling into the test
  
  PriceEURO<-ds$PriceEURO

  ds<-ds %>% select(where(is.numeric), -PriceEURO) %>% scale() %>% data.frame() %>% cbind(PriceEURO)
  
  Train <- ds[index==1, ] # Train data
  Test <- ds[index == 2, ] #Test data
  model1<- Train %>% lmrob(log(PriceEURO)~.,.)
  predicted<-predict(model1, newdata=Test)
  cat("\nR²: ", 
      1-(
        (sum((log(Test$PriceEURO)-predicted)^2))/
          (sum((log(Test$PriceEURO)-mean(log(Test$PriceEURO)))^2))),"\n")
  model1 %>% summary() %>% print()
  model1$coefficients %>% sort(decreasing = TRUE)
}

From the following cell it is possible to retrieve the most influent variables in the model by looking among the significant ones at those with the highest coefficient in absolute value.

cat("BAHRAIN:")
## BAHRAIN:
mec_statBR %>% rob_reg()
## 
## R²:  0.8886621 
## 
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
##  \--> method = "MM"
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.3266935 -0.1435016  0.0006825  0.1543759  0.9993950 
## 
## Coefficients:
##                   Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)       10.75787    0.00919 1170.563  < 2e-16 ***
## Cylinders          0.12830    0.03256    3.940 8.89e-05 ***
## Fuel_Capacity      0.16452    0.02103    7.824 1.69e-14 ***
## Liters_For_100km  -0.06353    0.02377   -2.672  0.00769 ** 
## Torque             0.19791    0.04304    4.598 4.98e-06 ***
## Top_Speed          0.20916    0.02949    7.093 2.98e-12 ***
## Seating_Capacity  -0.10199    0.01979   -5.155 3.23e-07 ***
## Acceleration_0100 -0.15107    0.02553   -5.916 4.95e-09 ***
## Length            -0.04884    0.03079   -1.586  0.11313    
## Width              0.05286    0.01205    4.389 1.30e-05 ***
## Height             0.05734    0.02375    2.414  0.01601 *  
## Wheelbase          0.10379    0.03156    3.289  0.00105 ** 
## Trunk_Capacity    -0.02819    0.01204   -2.342  0.01946 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Robust residual standard error: 0.2282 
## Multiple R-squared:   0.91,  Adjusted R-squared:  0.9086 
## Convergence in 15 IRWLS iterations
## 
## Robustness weights: 
##  2 observations c(310,476) are outliers with |weight| = 0 ( < 0.00013); 
##  75 weights are ~= 1. The remaining 705 ones are summarized as
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01118 0.86620 0.94990 0.89130 0.98420 0.99900 
## Algorithmic parameters: 
##        tuning.chi                bb        tuning.psi        refine.tol 
##         1.548e+00         5.000e-01         4.685e+00         1.000e-07 
##           rel.tol         scale.tol         solve.tol          zero.tol 
##         1.000e-07         1.000e-10         1.000e-07         1.000e-10 
##       eps.outlier             eps.x warn.limit.reject warn.limit.meanrw 
##         1.279e-04         8.041e-12         5.000e-01         5.000e-01 
##      nResample         max.it       best.r.s       k.fast.s          k.max 
##            500             50              2              1            200 
##    maxit.scale      trace.lev            mts     compute.rd fast.s.large.n 
##            200              0           1000              0           2000 
##                   psi           subsampling                   cov 
##            "bisquare"         "nonsingular"         ".vcov.avar1" 
## compute.outlier.stats 
##                  "SM" 
## seed : int(0)
##       (Intercept)         Top_Speed            Torque     Fuel_Capacity 
##       10.75787271        0.20915652        0.19791278        0.16451978 
##         Cylinders         Wheelbase            Height             Width 
##        0.12830315        0.10379297        0.05733867        0.05286172 
##    Trunk_Capacity            Length  Liters_For_100km  Seating_Capacity 
##       -0.02819339       -0.04884005       -0.06352557       -0.10199139 
## Acceleration_0100 
##       -0.15106847
cat("\n\nKUWAIT:")
## 
## 
## KUWAIT:
mec_statKW  %>% rob_reg()
## 
## R²:  0.8531936 
## 
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
##  \--> method = "MM"
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.836638 -0.166968  0.006574  0.162523  0.941895 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.728674   0.012552 854.770  < 2e-16 ***
## Cylinders         0.044608   0.026862   1.661 0.097436 .  
## Fuel_Capacity     0.105613   0.025441   4.151 3.91e-05 ***
## Liters_For_100km -0.008757   0.025763  -0.340 0.734077    
## Torque            0.304085   0.031588   9.627  < 2e-16 ***
## Top_Speed         0.356867   0.027357  13.045  < 2e-16 ***
## Seating_Capacity -0.088859   0.021868  -4.063 5.65e-05 ***
## Width             0.054069   0.014514   3.725 0.000218 ***
## Height            0.092683   0.024166   3.835 0.000142 ***
## Wheelbase         0.065010   0.021467   3.028 0.002592 ** 
## Trunk_Capacity   -0.039214   0.014135  -2.774 0.005750 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Robust residual standard error: 0.236 
## Multiple R-squared:  0.9022, Adjusted R-squared:  0.9002 
## Convergence in 14 IRWLS iterations
## 
## Robustness weights: 
##  36 weights are ~= 1. The remaining 454 ones are summarized as
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.07512 0.85280 0.94990 0.89160 0.98610 0.99900 
## Algorithmic parameters: 
##        tuning.chi                bb        tuning.psi        refine.tol 
##         1.548e+00         5.000e-01         4.685e+00         1.000e-07 
##           rel.tol         scale.tol         solve.tol          zero.tol 
##         1.000e-07         1.000e-10         1.000e-07         1.000e-10 
##       eps.outlier             eps.x warn.limit.reject warn.limit.meanrw 
##         2.041e-04         7.438e-12         5.000e-01         5.000e-01 
##      nResample         max.it       best.r.s       k.fast.s          k.max 
##            500             50              2              1            200 
##    maxit.scale      trace.lev            mts     compute.rd fast.s.large.n 
##            200              0           1000              0           2000 
##                   psi           subsampling                   cov 
##            "bisquare"         "nonsingular"         ".vcov.avar1" 
## compute.outlier.stats 
##                  "SM" 
## seed : int(0)
##      (Intercept)        Top_Speed           Torque    Fuel_Capacity 
##     10.728674391      0.356867458      0.304084544      0.105613081 
##           Height        Wheelbase            Width        Cylinders 
##      0.092682928      0.065009742      0.054069309      0.044608112 
## Liters_For_100km   Trunk_Capacity Seating_Capacity 
##     -0.008756928     -0.039214365     -0.088858677
cat("\n\nOMAN:")
## 
## 
## OMAN:
mec_statOM %>% rob_reg()
## 
## R²:  0.8713075 
## 
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
##  \--> method = "MM"
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.986764 -0.141222 -0.008141  0.145942  0.694959 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       10.88905    0.01106 984.820  < 2e-16 ***
## Cylinders          0.12628    0.02672   4.727 3.02e-06 ***
## Fuel_Capacity      0.07831    0.02242   3.493 0.000522 ***
## Liters_For_100km  -0.02534    0.02835  -0.894 0.371951    
## Torque             0.20533    0.04073   5.041 6.62e-07 ***
## Top_Speed          0.20531    0.02657   7.727 6.70e-14 ***
## Seating_Capacity  -0.12736    0.02316  -5.499 6.29e-08 ***
## Acceleration_0100 -0.13522    0.02691  -5.025 7.16e-07 ***
## Width              0.04335    0.01222   3.546 0.000430 ***
## Height             0.10878    0.02146   5.069 5.76e-07 ***
## Wheelbase          0.11992    0.02447   4.901 1.31e-06 ***
## Trunk_Capacity    -0.06123    0.01572  -3.896 0.000112 ***
## FT_Hybrid         -0.03505    0.02227  -1.574 0.116159    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Robust residual standard error: 0.2102 
## Multiple R-squared:  0.9207, Adjusted R-squared:  0.9187 
## Convergence in 17 IRWLS iterations
## 
## Robustness weights: 
##  observation 177 is an outlier with |weight| = 0 ( < 0.00021); 
##  42 weights are ~= 1. The remaining 441 ones are summarized as
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01013 0.86590 0.94780 0.88690 0.98470 0.99900 
## Algorithmic parameters: 
##        tuning.chi                bb        tuning.psi        refine.tol 
##         1.548e+00         5.000e-01         4.685e+00         1.000e-07 
##           rel.tol         scale.tol         solve.tol          zero.tol 
##         1.000e-07         1.000e-10         1.000e-07         1.000e-10 
##       eps.outlier             eps.x warn.limit.reject warn.limit.meanrw 
##         2.066e-04         1.033e-11         5.000e-01         5.000e-01 
##      nResample         max.it       best.r.s       k.fast.s          k.max 
##            500             50              2              1            200 
##    maxit.scale      trace.lev            mts     compute.rd fast.s.large.n 
##            200              0           1000              0           2000 
##                   psi           subsampling                   cov 
##            "bisquare"         "nonsingular"         ".vcov.avar1" 
## compute.outlier.stats 
##                  "SM" 
## seed : int(0)
##       (Intercept)            Torque         Top_Speed         Cylinders 
##       10.88905288        0.20533215        0.20530627        0.12627742 
##         Wheelbase            Height     Fuel_Capacity             Width 
##        0.11991893        0.10878440        0.07830963        0.04334625 
##  Liters_For_100km         FT_Hybrid    Trunk_Capacity  Seating_Capacity 
##       -0.02533854       -0.03504849       -0.06122890       -0.12735750 
## Acceleration_0100 
##       -0.13521684
cat("\n\nQatar:")
## 
## 
## Qatar:
mec_statQT %>% rob_reg()
## 
## R²:  0.8709802 
## 
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
##  \--> method = "MM"
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.9661351 -0.1550684  0.0003846  0.1459891  0.7342822 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       10.84969    0.01159 935.754  < 2e-16 ***
## Cylinders          0.06471    0.02883   2.244 0.025268 *  
## Fuel_Capacity      0.10305    0.02605   3.955 8.82e-05 ***
## Liters_For_100km  -0.02533    0.03858  -0.657 0.511792    
## Torque             0.25658    0.03864   6.641 8.59e-11 ***
## Top_Speed          0.24346    0.03410   7.139 3.58e-12 ***
## Seating_Capacity  -0.10463    0.02880  -3.633 0.000311 ***
## Acceleration_0100 -0.11464    0.03233  -3.546 0.000430 ***
## Width              0.04319    0.01461   2.957 0.003264 ** 
## Height             0.06940    0.02405   2.886 0.004081 ** 
## Wheelbase          0.11998    0.02649   4.529 7.50e-06 ***
## Trunk_Capacity    -0.03779    0.01861  -2.031 0.042812 *  
## TransMan          -0.02446    0.01636  -1.494 0.135727    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Robust residual standard error: 0.2125 
## Multiple R-squared:  0.9184, Adjusted R-squared:  0.9164 
## Convergence in 17 IRWLS iterations
## 
## Robustness weights: 
##  45 weights are ~= 1. The remaining 440 ones are summarized as
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.003354 0.843000 0.943100 0.879200 0.984300 0.999000 
## Algorithmic parameters: 
##        tuning.chi                bb        tuning.psi        refine.tol 
##         1.548e+00         5.000e-01         4.685e+00         1.000e-07 
##           rel.tol         scale.tol         solve.tol          zero.tol 
##         1.000e-07         1.000e-10         1.000e-07         1.000e-10 
##       eps.outlier             eps.x warn.limit.reject warn.limit.meanrw 
##         2.062e-04         9.127e-12         5.000e-01         5.000e-01 
##      nResample         max.it       best.r.s       k.fast.s          k.max 
##            500             50              2              1            200 
##    maxit.scale      trace.lev            mts     compute.rd fast.s.large.n 
##            200              0           1000              0           2000 
##                   psi           subsampling                   cov 
##            "bisquare"         "nonsingular"         ".vcov.avar1" 
## compute.outlier.stats 
##                  "SM" 
## seed : int(0)
##       (Intercept)            Torque         Top_Speed         Wheelbase 
##       10.84968805        0.25658402        0.24346222        0.11997904 
##     Fuel_Capacity            Height         Cylinders             Width 
##        0.10304928        0.06940083        0.06471392        0.04318722 
##          TransMan  Liters_For_100km    Trunk_Capacity  Seating_Capacity 
##       -0.02445611       -0.02533065       -0.03778802       -0.10463409 
## Acceleration_0100 
##       -0.11463592
cat("\n\nSAUDI ARABIA")
## 
## 
## SAUDI ARABIA
mec_statSA %>% rob_reg()
## 
## R²:  0.8458653 
## 
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
##  \--> method = "MM"
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.94595 -0.15643 -0.00349  0.15680  2.40420 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       10.84434    0.01779 609.513  < 2e-16 ***
## Cylinders          0.06789    0.04890   1.388 0.165764    
## Fuel_Capacity      0.11017    0.03579   3.078 0.002211 ** 
## Liters_For_100km  -0.04877    0.03256  -1.498 0.134882    
## Torque             0.25097    0.04716   5.322 1.64e-07 ***
## Top_Speed          0.26074    0.03752   6.949 1.32e-11 ***
## Seating_Capacity  -0.11413    0.02819  -4.048 6.09e-05 ***
## Acceleration_0100 -0.12793    0.02940  -4.352 1.68e-05 ***
## Length            -0.07267    0.03654  -1.989 0.047329 *  
## Width              0.03083    0.01993   1.547 0.122537    
## Height             0.09445    0.02551   3.703 0.000240 ***
## Wheelbase          0.13202    0.04306   3.066 0.002302 ** 
## Trunk_Capacity    -0.02472    0.01550  -1.595 0.111397    
## DrivingFWD        -0.06886    0.01770  -3.891 0.000115 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Robust residual standard error: 0.2317 
## Multiple R-squared:  0.9047, Adjusted R-squared:  0.9019 
## Convergence in 24 IRWLS iterations
## 
## Robustness weights: 
##  observation 21 is an outlier with |weight| = 0 ( < 0.00022); 
##  39 weights are ~= 1. The remaining 415 ones are summarized as
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.04663 0.86680 0.94850 0.88530 0.98380 0.99900 
## Algorithmic parameters: 
##        tuning.chi                bb        tuning.psi        refine.tol 
##         1.548e+00         5.000e-01         4.685e+00         1.000e-07 
##           rel.tol         scale.tol         solve.tol          zero.tol 
##         1.000e-07         1.000e-10         1.000e-07         1.000e-10 
##       eps.outlier             eps.x warn.limit.reject warn.limit.meanrw 
##         2.198e-04         6.690e-12         5.000e-01         5.000e-01 
##      nResample         max.it       best.r.s       k.fast.s          k.max 
##            500             50              2              1            200 
##    maxit.scale      trace.lev            mts     compute.rd fast.s.large.n 
##            200              0           1000              0           2000 
##                   psi           subsampling                   cov 
##            "bisquare"         "nonsingular"         ".vcov.avar1" 
## compute.outlier.stats 
##                  "SM" 
## seed : int(0)
##       (Intercept)         Top_Speed            Torque         Wheelbase 
##       10.84434211        0.26073533        0.25097195        0.13201844 
##     Fuel_Capacity            Height         Cylinders             Width 
##        0.11016927        0.09445222        0.06789084        0.03083190 
##    Trunk_Capacity  Liters_For_100km        DrivingFWD            Length 
##       -0.02471781       -0.04877284       -0.06885610       -0.07267291 
##  Seating_Capacity Acceleration_0100 
##       -0.11412580       -0.12793431
cat("\n\nUAE:")
## 
## 
## UAE:
mec_statUAE %>% rob_reg()
## 
## R²:  0.8914368 
## 
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
##  \--> method = "MM"
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.953682 -0.168612  0.006339  0.163670  0.796055 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       10.76691    0.01295 831.355  < 2e-16 ***
## Cylinders          0.14844    0.03677   4.037 6.17e-05 ***
## Fuel_Capacity      0.08232    0.03366   2.446 0.014765 *  
## Liters_For_100km  -0.05864    0.03819  -1.535 0.125258    
## Torque             0.22845    0.04836   4.724 2.91e-06 ***
## Top_Speed          0.21031    0.02788   7.544 1.82e-13 ***
## Seating_Capacity  -0.07130    0.02204  -3.235 0.001286 ** 
## Acceleration_0100 -0.11047    0.02824  -3.911 0.000103 ***
## Width              0.07893    0.01660   4.754 2.53e-06 ***
## Height             0.04918    0.02158   2.279 0.023015 *  
## Wheelbase          0.05888    0.02324   2.534 0.011559 *  
## Trunk_Capacity    -0.03038    0.01670  -1.820 0.069313 .  
## DrivingFWD        -0.10175    0.01532  -6.641 7.31e-11 ***
## FT_Hybrid          0.02419    0.02077   1.165 0.244633    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Robust residual standard error: 0.2379 
## Multiple R-squared:  0.9055, Adjusted R-squared:  0.9033 
## Convergence in 20 IRWLS iterations
## 
## Robustness weights: 
##  46 weights are ~= 1. The remaining 536 ones are summarized as
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.07166 0.83870 0.94750 0.88390 0.98450 0.99900 
## Algorithmic parameters: 
##        tuning.chi                bb        tuning.psi        refine.tol 
##         1.548e+00         5.000e-01         4.685e+00         1.000e-07 
##           rel.tol         scale.tol         solve.tol          zero.tol 
##         1.000e-07         1.000e-10         1.000e-07         1.000e-10 
##       eps.outlier             eps.x warn.limit.reject warn.limit.meanrw 
##         1.718e-04         1.026e-11         5.000e-01         5.000e-01 
##      nResample         max.it       best.r.s       k.fast.s          k.max 
##            500             50              2              1            200 
##    maxit.scale      trace.lev            mts     compute.rd fast.s.large.n 
##            200              0           1000              0           2000 
##                   psi           subsampling                   cov 
##            "bisquare"         "nonsingular"         ".vcov.avar1" 
## compute.outlier.stats 
##                  "SM" 
## seed : int(0)
##       (Intercept)            Torque         Top_Speed         Cylinders 
##       10.76690504        0.22844869        0.21030951        0.14843925 
##     Fuel_Capacity             Width         Wheelbase            Height 
##        0.08231884        0.07893028        0.05887695        0.04918252 
##         FT_Hybrid    Trunk_Capacity  Liters_For_100km  Seating_Capacity 
##        0.02418564       -0.03038375       -0.05864114       -0.07129609 
##        DrivingFWD Acceleration_0100 
##       -0.10174564       -0.11047159
#Storing the value of the R2 of the robust regression
R2rr<-c(0.8886621,
0.8531936,
0.8713075,
0.8709802,
0.8458653,
0.8914368)
mec_map5 <- data.frame(region=c("Bahrain", "Kuwait", "Oman", "Qatar", "Saudi Arabia", "United Arab Emirates"),
                       R2rr=R2rr)

mapdata<-map_data("world")
mapdata <- left_join(mapdata, mec_map5, by="region") 
mapdata<-mapdata %>% filter(!is.na(mapdata$R2rr))

ggplot(mapdata, aes(x=long, y=lat, group=group))+
  geom_polygon(aes(fill=R2rr), color="black")+
  scale_fill_gradient(name="R² for Robust Regression\n", low="#CCFFFF", high="#000066",
                      limit=c(.84, .90),
                      breaks=seq(from=.84,
                                 to=.90,
                            by=.01))+
  
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        rect = element_blank())

TREES

Since neither the linear nor the robust linear model were good, for a reason or another, we could try with regression trees.

— Notice that we are doing first a single regression tree before doing the random forest

tree_constructor <- function(ds) { #this function works only with rpart objects
  set.seed(1)
  index <- sample(2, nrow(ds), prob = c(0.8, 0.2), replace = TRUE) #so we give to each observation a prob of 80% of falling into the train and 20% of falling into the test

  Train <- ds[index==1, ] # Train data
  Test <- ds[index == 2, ] #Test data
  model1<- Train %>% rpart(log(PriceEURO)~.,., method="anova", model = TRUE) 
  #we use anova because we are doing regression
  
  
  cp <- which.min(model1$cptable[, "xerror"]) %>% model1$cptable[., "CP"]
  #we select the min value of the column xerror of the cptable (where cp stands for the cost of pruning )
  
  
  model1_pr <- prune(tree = model1, cp =cp)
  model1_pr %>% rpart.plot(roundint = TRUE, digits = 4, left=FALSE)
  
  predicted<-predict(model1_pr, newdata=Test)
  cat("\nThe MSE is:",mean(predicted-log(Test$PriceEURO))^2,"\n")

cat("\nR²: ", 
      1-(
        (sum((log(Test$PriceEURO)-predicted)^2))/
          (sum((log(Test$PriceEURO)-mean(log(Test$PriceEURO)))^2))),"\n")
print(data.frame(model1_pr$variable.importance))
  return(1-(
        (sum((log(Test$PriceEURO)-predicted)^2))/
          (sum((log(Test$PriceEURO)-mean(log(Test$PriceEURO)))^2))))

}

We can now see the best single tree for each country

R2Tree<- c(tree_constructor(mec_statBR), #create a variable R2Tree for the Rsquared
tree_constructor(mec_statKW),
tree_constructor(mec_statOM),
tree_constructor(mec_statQT),
tree_constructor(mec_statSA),
tree_constructor(mec_statUAE))

## 
## The MSE is: 2.397884e-05 
## 
## R²:  0.8180443 
##                   model1_pr.variable.importance
## Torque                               385.544836
## Acceleration_0100                    270.107073
## Top_Speed                            236.847364
## Cylinders                            199.457032
## Fuel_Capacity                        140.951901
## Driving                              105.660058
## Width                                 33.797930
## Length                                21.015265
## Height                                 9.387311
## Liters_For_100km                       6.879841
## Seating_Capacity                       6.484826
## Wheelbase                              6.391267
## Trunk_Capacity                         3.175046

## 
## The MSE is: 5.513108e-05 
## 
## R²:  0.7998163 
##                  model1_pr.variable.importance
## Torque                              241.759365
## Top_Speed                           144.147067
## Cylinders                           110.595604
## Wheelbase                           110.555304
## Fuel_Capacity                       104.176723
## Width                                80.915688
## Driving                              18.318286
## Liters_For_100km                      5.865978
## Height                                5.226239
## Trunk_Capacity                        4.562545
## Seating_Capacity                      1.808048
## Fuel_Type                             1.181117

## 
## The MSE is: 0.0004196496 
## 
## R²:  0.835828 
##                   model1_pr.variable.importance
## Torque                              244.5673703
## Acceleration_0100                   163.8823343
## Top_Speed                           136.7798951
## Cylinders                           127.2608099
## Wheelbase                           107.0998685
## Fuel_Capacity                       105.7481582
## Driving                              20.2455195
## Width                                 7.6473476
## Liters_For_100km                      3.1402231
## Height                                0.8976608
## Trunk_Capacity                        0.8976608

## 
## The MSE is: 0.001506967 
## 
## R²:  0.8542003 
##                   model1_pr.variable.importance
## Torque                              246.4207359
## Acceleration_0100                   183.6716560
## Top_Speed                           160.5372171
## Cylinders                           132.1265966
## Wheelbase                            86.0343215
## Liters_For_100km                     84.5581803
## Width                                16.3161102
## Fuel_Capacity                        11.7870555
## Driving                              10.1884367
## Height                                4.6239124
## Trunk_Capacity                        2.2538783
## Seating_Capacity                      0.1224919

## 
## The MSE is: 0.0003354466 
## 
## R²:  0.7114432 
##                   model1_pr.variable.importance
## Torque                               235.438646
## Acceleration_0100                    146.160622
## Top_Speed                            118.817794
## Cylinders                             93.715447
## Wheelbase                             80.757720
## Width                                 67.227059
## Fuel_Capacity                         21.263345
## DrivingFWD                            15.013701
## Length                                 8.855546
## Liters_For_100km                       5.238668
## Trunk_Capacity                         4.427914
## Fuel_Type                              2.249831
## Height                                 2.225605
## Seating_Capacity                       1.907661

## 
## The MSE is: 0.001250991 
## 
## R²:  0.8046047 
##                   model1_pr.variable.importance
## Torque                               321.645929
## Acceleration_0100                    211.774480
## Top_Speed                            186.399227
## Cylinders                            172.916515
## Fuel_Capacity                        142.437278
## Width                                130.365838
## Wheelbase                             14.010734
## Liters_For_100km                       7.829249
## Height                                 6.972115
## Trunk_Capacity                         5.352463
mec_map2 <- data.frame(region=c("Bahrain", "Kuwait", "Oman", "Qatar", "Saudi Arabia", "United Arab Emirates"),
                       R2Tree=R2Tree)

mapdata<-map_data("world")
mapdata <- left_join(mapdata, mec_map2, by="region") 
mapdata<-mapdata %>% filter(!is.na(mapdata$R2Tree))

ggplot(mapdata, aes(x=long, y=lat, group=group))+
  geom_polygon(aes(fill=R2Tree), color="black")+
  scale_fill_gradient(name="R² for Regression Trees\n", low="#CCFFFF", high="#000066",
                      limit=c(.70, .90),
                      breaks=seq(from=.70,
                                 to=.90,
                            by=.04))+
  
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        rect = element_blank())

Not bad but this is the worst model obtained so far.

Random Forest algorithm

rf_model <- function(ds, country) {
  set.seed(1)
  
  index <- sample(2, nrow(ds), prob = c(0.8, 0.2), replace = TRUE) #so we give to each observation a prob of 80% of falling into the train and 20% of falling into the test

  Train <- ds[index==1, ] # Train data
  Test <- ds[index==2, ]
  
  ##### BEST MTRY #####
  
  # Define the cross validation
  control <- trainControl(method = "cv",  
                        number = 10,  #number of folds
                        search = "random")  

  grid <- data.frame(mtry = c(2, 3, 4))  # 

  # train the model
  best_mtry_model <- train(log(PriceEURO) ~ .,  
                      data = Train, 
                      method = "rf", 
                      trControl = control,
                      tuneGrid=grid)
  
  cat("\nThe best nr of mtry for min error is: ", best_mtry_model$bestTune[1,1])
  
  ##### BEST NTREE #####
  
  tree_nr <- seq(from= 50, to= 500, by=50)  # number of trees to test

  MSE_train<-c()
  
  for (i in 1:length(tree_nr)) {
    
    model_rf <- randomForest(log(PriceEURO) ~ ., data = Train, ntree = tree_nr[i],
                             mtry=best_mtry_model$bestTune[1,1])
    
    MSE_train<-append(MSE_train, model_rf$mse[i]) #append the mse to the vector
  }


# MSE error minimizer
best_ntree <- min(MSE_train[MSE_train<0.09 & MSE_train>0.04]) %>% match(MSE_train) %>% tree_nr[.] #best_ntree is the number such that the MSE is within 0.05 and 0.09 to avoid overfitting. These values have been found thanks to empirical tests
  
  
  cat("\nThe best nr of trees for min error is: ", best_ntree)
  
  
  
  rf1=randomForest(log(PriceEURO)~.,
                   data=Train,
                   ntree=best_ntree,
                   mtry=best_mtry_model$bestTune[1,1],
                   importance=TRUE)
  
  print(rf1) 
  plot(rf1, main=country)
  varImpPlot(rf1, main=country)
  
    yhat <- predict(rf1,newdata = Test)
  plot (yhat , Test$PriceEURO %>% log(), xlab="Predicted ln(Price)",ylab = "Actual ln(Price)", main=country)
  
  abline (0, 1, col="red")
  cat("Mean squared error: ",mean(yhat - log(Test$PriceEURO))^2)
  cat("\nR²: ", 
      1-(
        (sum((log(Test$PriceEURO)-yhat)^2))/
        (sum((log(Test$PriceEURO)-mean(log(Test$PriceEURO)))^2))))
}
#NEW
rf_model(mec_statBR, "Bahrain")
## 
## The best nr of mtry for min error is:  4
## The best nr of trees for min error is:  500
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.03080628
##                     % Var explained: 94.96

## Mean squared error:  5.706505e-05
## R²:  0.9461617
#NEW
rf_model(mec_statKW, "Kuwait")
## 
## The best nr of mtry for min error is:  4
## The best nr of trees for min error is:  500
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.03652742
##                     % Var explained: 93.99

## Mean squared error:  0.0004108964
## R²:  0.926028
#NEW
rf_model(mec_statOM, "Oman")
## 
## The best nr of mtry for min error is:  3
## The best nr of trees for min error is:  500
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.03564172
##                     % Var explained: 94.13

## Mean squared error:  0.0004943466
## R²:  0.9390241
#NEW
rf_model(mec_statQT, "Qatar")
## 
## The best nr of mtry for min error is:  4
## The best nr of trees for min error is:  350
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 350
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.0354777
##                     % Var explained: 94.27

## Mean squared error:  0.0002218077
## R²:  0.9436625
#NEW
rf_model(mec_statSA, "Saudi Arabia")
## 
## The best nr of mtry for min error is:  4
## The best nr of trees for min error is:  500
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.05466865
##                     % Var explained: 91.56

## Mean squared error:  0.0002745637
## R²:  0.9077597
#NEW
rf_model(mec_statUAE, "United Arab Emirates")
## 
## The best nr of mtry for min error is:  3
## The best nr of trees for min error is:  500
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.04145605
##                     % Var explained: 93.89

## Mean squared error:  0.0001357007
## R²:  0.9362491
R2rf<-c(0.9458579, 0.9260215,0.9434955, 0.9389488, 0.9077597, 0.9359294)

mec_map3 <- data.frame(region=c("Bahrain", "Kuwait", "Qatar", "Oman", "Saudi Arabia", "United Arab Emirates"),
                       R2rf=R2rf)

mapdata<-map_data("world")
mapdata <- left_join(mapdata, mec_map3, by="region") 
mapdata<-mapdata %>% filter(!is.na(mapdata$R2rf))

ggplot(mapdata, aes(x=long, y=lat, group=group))+
  geom_polygon(aes(fill=R2rf), color="black")+
  scale_fill_gradient(name="R² for Random Forest\n", low="#CCFFFF", high="#000066",
                      limit=c(.90, .95),
                      breaks=seq(from=.90,
                                 to=.95,
                            by=.01))+
  
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        rect = element_blank())

The random forest is clearly the best model even though it is a bit slow.

R2df<-left_join(mec_map2,mec_map3, by="region")
R2df$R2rr<-R2rr

R2best_Tr_rr<-c()
for (i in (1:6)){
  
  if (R2df[i,2]>R2df[i,4]){
    
  R2best_Tr_rr[i]<-"Regression Tree"
  } else if (R2df[i,2]==R2df[i,4]) {
    R2best_Tr_rr[i]<-"Tie"
  }else {
      R2best_Tr_rr[i]<-"Robust Regression"
  }
  
  }
  
R2df$R2best_Tr_rr<-R2best_Tr_rr

R2df$R2best_Tr_rr %<>% as.factor
R2df
##                 region    R2Tree      R2rf      R2rr      R2best_Tr_rr
## 1              Bahrain 0.8180443 0.9458579 0.8886621 Robust Regression
## 2               Kuwait 0.7998163 0.9260215 0.8531936 Robust Regression
## 3                 Oman 0.8358280 0.9389488 0.8713075 Robust Regression
## 4                Qatar 0.8542003 0.9434955 0.8709802 Robust Regression
## 5         Saudi Arabia 0.7114432 0.9077597 0.8458653 Robust Regression
## 6 United Arab Emirates 0.8046047 0.9359294 0.8914368 Robust Regression

In this map we are comparing R2 to see which is the highest

mapdata<-map_data("world")
mapdata <- left_join(mapdata, R2df, by="region") 
mapdata<-mapdata %>% filter(!is.na(mapdata$R2best_Tr_rr))

ggplot(mapdata, aes(x=long, y=lat, group=group))+
  geom_polygon(aes(fill=R2best_Tr_rr), color="white")+
    scale_fill_manual(name="Best model according to R²:\n",labels=c("Robust Regression", "Regression Tree","Tie"),
                      values=c( "#000066", "#CCFFFF", "lightgrey"))+
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        rect = element_blank())

UNSUPERVISED LEARNING

For this section I decided to use the PCA. Since the PCA works better with correlated variables, I decided to restore the original datasets, excluding just the variable Area

mec_statSA2 <- mec_stat %>% filter(Area=="Saudi Arabia") %>% select(-Area)
mec_statUAE2 <- mec_stat %>% filter(Area=="UAE") %>% select(-Area)
mec_statBR2 <- mec_stat %>% filter(Area=="Bahrain") %>% select(-Area)
mec_statQT2 <- mec_stat %>% filter(Area=="Qatar") %>% select(-Area)
mec_statOM2 <- mec_stat %>% filter(Area=="Oman") %>% select(-Area)
mec_statKW2 <- mec_stat %>% filter(Area=="Kuwait") %>% select(-Area)
mec_statSA2 %>% glimpse() #for example
## Rows: 565
## Columns: 18
## $ Engine_Capacity   <dbl> 1.2, 1.6, 1.5, 1.4, 1.6, 1.4, 1.4, 1.5, 1.6, 1.4, 1.…
## $ Cylinders         <dbl> 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4…
## $ Driving           <fct> Front Wheel Drive, Front Wheel Drive, Front Wheel Dr…
## $ Fuel_Capacity     <dbl> 42, 50, 48, 35, 50, 45, 45, 48, 45, 35, 60, 35, 48, …
## $ Liters_For_100km  <dbl> 4.9, 6.4, 5.8, 5.1, 6.4, 6.3, 6.3, 5.8, 6.4, 5.1, 7.…
## $ Fuel_Type         <fct> Petrol, Petrol, Petrol, Petrol, Petrol, Petrol, Petr…
## $ Horsepower        <int> 76, 102, 112, 98, 102, 100, 100, 112, 123, 98, 130, …
## $ Torque            <dbl> 100, 145, 150, 127, 145, 132, 132, 150, 151, 127, 17…
## $ Transmission      <fct> Automatic, Automatic, Automatic, Automatic, Automati…
## $ Top_Speed         <dbl> 170, 180, 170, 170, 180, 183, 183, 170, 193, 170, 18…
## $ Seating_Capacity  <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5…
## $ Acceleration_0100 <dbl> 14.0, 11.0, 10.9, 12.0, 11.0, 13.4, 13.4, 10.9, 11.3…
## $ Length            <dbl> 4.24, 4.35, 4.31, 3.64, 4.35, 4.44, 4.44, 4.31, 4.44…
## $ Width             <dbl> 1.67, 1.99, 1.81, 1.60, 1.99, 1.73, 1.73, 1.81, 1.73…
## $ Height            <dbl> 1.51, 1.53, 1.62, 1.48, 1.53, 1.48, 1.48, 1.62, 1.48…
## $ Wheelbase         <dbl> 2.55, 2.63, 2.58, 2.38, 2.63, 2.60, 2.60, 2.58, 2.60…
## $ Trunk_Capacity    <dbl> 450, 510, 448, 314, 510, 396, 396, 448, 396, 314, 33…
## $ PriceEURO         <dbl> 8524.75, 11232.50, 14446.75, 13447.50, 13695.00, 133…

Some data cleaning not to have categorical variables

#BAHRAIN
mec_statBR2$DrivingFWD <- ifelse(mec_statBR2$Driving=="Front Wheel Drive", 1,0)
mec_statBR2$DrivingAWD <- ifelse(mec_statBR2$Driving=="All Wheel Drive", 1,0)
mec_statBR2$DrivingRWD <- ifelse(mec_statBR2$Driving=="Rear Wheel Drive", 1,0)
mec_statBR2$TransMan <- ifelse(mec_statBR2$Transmission=="Manual", 1,0)
mec_statBR2$TransCVT <- ifelse(mec_statBR2$Transmission=="CVT", 1,0)
mec_statBR2$TransAuto <- ifelse(mec_statBR2$Transmission=="Automatic", 1,0)
mec_statBR2$FTHybr <- ifelse(mec_statBR2$Fuel_Type=="Hybrid", 1,0)
mec_statBR2$FTPetr <- ifelse(mec_statBR2$Fuel_Type=="Petrol", 1,0)
mec_statBR2$FTDiesel <- ifelse(mec_statBR2$Fuel_Type=="Diesel", 1,0)
mec_statBR2 <- mec_statBR2 %>% select(-c(Driving, Transmission, Fuel_Type, PriceEURO))

#KUWAIT
mec_statKW2$DrivingFWD <- ifelse(mec_statKW2$Driving=="Front Wheel Drive", 1,0)
mec_statKW2$DrivingAWD <- ifelse(mec_statKW2$Driving=="All Wheel Drive", 1,0)
mec_statKW2$DrivingRWD <- ifelse(mec_statKW2$Driving=="Rear Wheel Drive", 1,0)
mec_statKW2$TransMan <- ifelse(mec_statKW2$Transmission=="Manual", 1,0)
mec_statKW2$TransCVT <- ifelse(mec_statKW2$Transmission=="CVT", 1,0)
mec_statKW2$TransAuto <- ifelse(mec_statKW2$Transmission=="Automatic", 1,0)
mec_statKW2$FTHybr <- ifelse(mec_statKW2$Fuel_Type=="Hybrid", 1,0)
mec_statKW2$FTPetr <- ifelse(mec_statKW2$Fuel_Type=="Petrol", 1,0)
mec_statKW2$FTDiesel <- ifelse(mec_statKW2$Fuel_Type=="Diesel", 1,0)
mec_statKW2 <- mec_statKW2 %>% select(-c(Driving, Transmission, Fuel_Type, PriceEURO))

#QATAR
mec_statQT2$DrivingFWD <- ifelse(mec_statQT2$Driving=="Front Wheel Drive", 1,0)
mec_statQT2$DrivingAWD <- ifelse(mec_statQT2$Driving=="All Wheel Drive", 1,0)
mec_statQT2$DrivingRWD <- ifelse(mec_statQT2$Driving=="Rear Wheel Drive", 1,0)
mec_statQT2$TransMan <- ifelse(mec_statQT2$Transmission=="Manual", 1,0)
mec_statQT2$TransCVT <- ifelse(mec_statQT2$Transmission=="CVT", 1,0)
mec_statQT2$TransAuto <- ifelse(mec_statQT2$Transmission=="Automatic", 1,0)
mec_statQT2$FTHybr <- ifelse(mec_statQT2$Fuel_Type=="Hybrid", 1,0)
mec_statQT2$FTPetr <- ifelse(mec_statQT2$Fuel_Type=="Petrol", 1,0)
mec_statQT2$FTDiesel <- ifelse(mec_statQT2$Fuel_Type=="Diesel", 1,0)
mec_statQT2 <- mec_statQT2 %>% select(-c(Driving, Transmission, Fuel_Type, PriceEURO))

#OMAN
mec_statOM2$DrivingFWD <- ifelse(mec_statOM2$Driving=="Front Wheel Drive", 1,0)
mec_statOM2$DrivingAWD <- ifelse(mec_statOM2$Driving=="All Wheel Drive", 1,0)
mec_statOM2$DrivingRWD <- ifelse(mec_statOM2$Driving=="Rear Wheel Drive", 1,0)
mec_statOM2$TransMan <- ifelse(mec_statOM2$Transmission=="Manual", 1,0)
mec_statOM2$TransCVT <- ifelse(mec_statOM2$Transmission=="CVT", 1,0)
mec_statOM2$TransAuto <- ifelse(mec_statOM2$Transmission=="Automatic", 1,0)
mec_statOM2$FTHybr <- ifelse(mec_statOM2$Fuel_Type=="Hybrid", 1,0)
mec_statOM2$FTPetr <- ifelse(mec_statOM2$Fuel_Type=="Petrol", 1,0)
mec_statOM2$FTDiesel <- ifelse(mec_statOM2$Fuel_Type=="Diesel", 1,0)
mec_statOM2 <- mec_statOM2 %>% select(-c(Driving, Transmission, Fuel_Type, PriceEURO))

#SAUDI ARABIA
mec_statSA2$DrivingFWD <- ifelse(mec_statSA2$Driving=="Front Wheel Drive", 1,0)
mec_statSA2$DrivingAWD <- ifelse(mec_statSA2$Driving=="All Wheel Drive", 1,0)
mec_statSA2$DrivingRWD <- ifelse(mec_statSA2$Driving=="Rear Wheel Drive", 1,0)
mec_statSA2$TransMan <- ifelse(mec_statSA2$Transmission=="Manual", 1,0)
mec_statSA2$TransCVT <- ifelse(mec_statSA2$Transmission=="CVT", 1,0)
mec_statSA2$TransAuto <- ifelse(mec_statSA2$Transmission=="Automatic", 1,0)
mec_statSA2$FTHybr <- ifelse(mec_statSA2$Fuel_Type=="Hybrid", 1,0)
mec_statSA2$FTPetr <- ifelse(mec_statSA2$Fuel_Type=="Petrol", 1,0)
mec_statSA2$FTDiesel <- ifelse(mec_statSA2$Fuel_Type=="Diesel", 1,0)
mec_statSA2 <- mec_statSA2 %>% select(-c(Driving, Transmission, Fuel_Type, PriceEURO))

#UNITED ARAB EMIRATES
mec_statUAE2$DrivingFWD <- ifelse(mec_statUAE2$Driving=="Front Wheel Drive", 1,0)
mec_statUAE2$DrivingAWD <- ifelse(mec_statUAE2$Driving=="All Wheel Drive", 1,0)
mec_statUAE2$DrivingRWD <- ifelse(mec_statUAE2$Driving=="Rear Wheel Drive", 1,0)
mec_statUAE2$TransMan <- ifelse(mec_statUAE2$Transmission=="Manual", 1,0)
mec_statUAE2$TransCVT <- ifelse(mec_statUAE2$Transmission=="CVT", 1,0)
mec_statUAE2$TransAuto <- ifelse(mec_statUAE2$Transmission=="Automatic", 1,0)
mec_statUAE2$FTHybr <- ifelse(mec_statUAE2$Fuel_Type=="Hybrid", 1,0)
mec_statUAE2$FTPetr <- ifelse(mec_statUAE2$Fuel_Type=="Petrol", 1,0)
mec_statUAE2$FTDiesel <- ifelse(mec_statUAE2$Fuel_Type=="Diesel", 1,0)
mec_statUAE2 <- mec_statUAE2 %>% select(-c(Driving, Transmission, Fuel_Type, PriceEURO))

Pca 3D plot for fun:

pcatest <- mec_statBR2 %>% prcomp(scale. = TRUE, center = TRUE)

#3D plot just for fun, not rendered in the html document unfortunately
scores=pcatest$x[,1:3] %>% as.data.frame()
plot3d(scores)

text3d(pcatest$rotation[,1:3]*1000, 
       texts=rownames(pcatest$rotation), 
       col="red", 
       cex=0.8)
 

coords <- c()
for (i in 1:nrow(pcatest$rotation)) {
  coords <- rbind(coords, rbind(c(0,0,0),pcatest$rotation[i,1:3]))
}
 
lines3d(coords*1000, 
        col="orange", 
        lwd=1)

PCA function:

Here we apply the Principal component analysis and we select up to the 7th principal component which is the last one with eigenvalues > 1 meaning that each principal component explains at least as much information as one column in the original datasets.

pca<- function(ds, country){
  pca1 <- ds %>% prcomp(scale. = TRUE, center = TRUE)
    
    var <- pca1 %>% get_pca_var()
    
  corplot<-corrplot(var$cor[,1:7], is.corr = FALSE, col.lim = c(-1,1),cl.align.text = "l",tl.col = "black" )
  

  contrib<-pca1 %>% fviz_contrib(choice = "var", axes = 1)+ggtitle(paste("Contribution of variables to 1st Principal component - ",country))

    pca1 %>% fviz_pca_var(col.var = pca1$rotation[,3], repel = TRUE)+
    ggtitle(paste(country, " PCA"))+
    scale_color_gradient2(name="Correlation with\nthe third component",limits=c(-1,1), low="red3",mid="deepskyblue",high="navy") #circle correlation plot 
    
    return(contrib)
}

PCA results:

pca(mec_statBR2, "Bahrain")

pca(mec_statKW2, "Kuwait")

pca(mec_statOM2, "Oman")

pca(mec_statQT2, "Qatar")

pca(mec_statSA2, "Saudi Arabia")

pca(mec_statUAE2, "United Arab Emirates")

pr_lm <- function(ds_pc,ds_original, request="default"){
  
  if (is.element("PriceEURO", ds_pc %>% colnames())){
    PriceEURO<- ds_pc$PriceEURO
    ds_pc$PriceEURO<-NULL
  }
  
  #PC computation
  pc<-ds_pc %>% prcomp(scale. = TRUE, center=TRUE)
  sign_var<-pc$sdev %>% data.frame() %>% filter(.>1) %>% dim() %>% .[1]
  pc<-pc$x[,1:sign_var] %>% data.frame()
  
  
 if (class(ds_original)=="data.frame"){
   pc$PriceEURO<-ds_original$PriceEURO
 } else {
   pc$PriceEURO<-PriceEURO
 }
  
  
  if (request=="summary"){
    lm_pc=pc %>% lm(log(PriceEURO)~.,.)
    lm_pc %>% summary() %>% return()
    
    
  } else if (request=="tree"){
  tree_constructor(pc)


  }else if (request=="rf") {
  rf_model(pc,"Whatever")
    } else {
    set.seed(1)
  
  index <- sample(2, nrow(pc), prob = c(0.8, 0.2), replace = TRUE) #so we give to each observation a prob of 80% of falling into the train and 20% of falling into the test

  Train <- pc[index==1, ] # Train data
  Test <- pc[index==2, ]
  lm_pc=Train %>% lm(log(PriceEURO)~.,.)
  yhat <- predict(lm_pc, newdata=Test)
  
  #in this case you return the R^2
  return(1-(
        (sum((log(Test$PriceEURO)-yhat)^2))/
        (sum((log(Test$PriceEURO)-mean(log(Test$PriceEURO)))^2))))
  }
  
}
pr_lm(mec_statBR2, mec_statBR,"tree")

## 
## The MSE is: 1.880744e-05 
## 
## R²:  0.798516 
##     model1_pr.variable.importance
## PC1                     384.59112
## PC2                      65.39607
## PC7                      52.82062
## PC3                      49.26478
## PC5                      38.05101
## PC4                      27.54177
## PC6                      25.91887
## [1] 0.798516
pr_lm(mec_statKW2, mec_statKW,"tree")

## 
## The MSE is: 0.0003342216 
## 
## R²:  0.8100026 
##     model1_pr.variable.importance
## PC1                     223.97828
## PC2                      61.00428
## PC4                      48.99104
## PC3                      46.18381
## PC5                      45.43138
## PC7                      38.46012
## PC6                      34.32127
## [1] 0.8100026
pr_lm(mec_statQT2, mec_statQT,"tree")

## 
## The MSE is: 9.983318e-05 
## 
## R²:  0.7812357 
##     model1_pr.variable.importance
## PC1                     230.25238
## PC3                      70.13930
## PC2                      46.75305
## PC6                      43.15769
## PC4                      42.97652
## PC5                      33.72889
## PC7                      15.85542
## [1] 0.7812357
pr_lm(mec_statOM2, mec_statOM,"tree")

## 
## The MSE is: 0.0002256637 
## 
## R²:  0.7608527 
##     model1_pr.variable.importance
## PC1                     230.68425
## PC4                      63.04105
## PC7                      55.52803
## PC3                      44.06926
## PC5                      41.90118
## PC2                      27.48985
## PC6                      21.00088
## [1] 0.7608527
pr_lm(mec_statSA2, mec_statSA,"tree")

## 
## The MSE is: 0.002080731 
## 
## R²:  0.7687297 
##     model1_pr.variable.importance
## PC1                    215.952104
## PC3                     42.810689
## PC5                     41.759841
## PC2                     39.029829
## PC4                     38.591129
## PC6                     16.792633
## PC7                      4.970742
## [1] 0.7687297
pr_lm(mec_statUAE2, mec_statUAE,"tree")

## 
## The MSE is: 0.0005434524 
## 
## R²:  0.7488691 
##     model1_pr.variable.importance
## PC1                     299.15173
## PC2                      63.85345
## PC3                      55.68239
## PC4                      51.89348
## PC5                      43.68406
## PC7                      39.75202
## PC6                      17.61245
## [1] 0.7488691
pr_lm(mec_statBR2, mec_statBR,"summary")
## 
## Call:
## lm(formula = log(PriceEURO) ~ ., data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34884 -0.15417  0.00088  0.16265  1.16977 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 10.7508273  0.0089760 1197.724  < 2e-16 ***
## PC1          0.2541204  0.0032524   78.134  < 2e-16 ***
## PC2         -0.0817215  0.0045476  -17.970  < 2e-16 ***
## PC3         -0.0009839  0.0060863   -0.162    0.872    
## PC4          0.0350355  0.0064834    5.404 8.21e-08 ***
## PC5          0.0806239  0.0078819   10.229  < 2e-16 ***
## PC6         -0.0079893  0.0084767   -0.942    0.346    
## PC7          0.0565940  0.0086959    6.508 1.22e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2806 on 969 degrees of freedom
## Multiple R-squared:  0.8721, Adjusted R-squared:  0.8711 
## F-statistic: 943.6 on 7 and 969 DF,  p-value: < 2.2e-16
pr_lm(mec_statKW2, mec_statKW,"summary")
## 
## Call:
## lm(formula = log(PriceEURO) ~ ., data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.00627 -0.16322  0.00031  0.18788  0.89924 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.733543   0.011865 904.616  < 2e-16 ***
## PC1          0.250790   0.004333  57.885  < 2e-16 ***
## PC2         -0.117592   0.005965 -19.713  < 2e-16 ***
## PC3         -0.015909   0.007977  -1.994  0.04656 *  
## PC4         -0.050359   0.008234  -6.116 1.72e-09 ***
## PC5          0.075761   0.010386   7.294 9.47e-13 ***
## PC6         -0.036587   0.011247  -3.253  0.00121 ** 
## PC7          0.022133   0.011354   1.949  0.05172 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2935 on 604 degrees of freedom
## Multiple R-squared:  0.8643, Adjusted R-squared:  0.8628 
## F-statistic: 549.7 on 7 and 604 DF,  p-value: < 2.2e-16
pr_lm(mec_statQT2, mec_statQT,"summary")
## 
## Call:
## lm(formula = log(PriceEURO) ~ ., data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.95183 -0.14513  0.00807  0.16910  0.81265 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.838716   0.011233 964.897  < 2e-16 ***
## PC1          0.256159   0.004089  62.649  < 2e-16 ***
## PC2         -0.105311   0.005773 -18.241  < 2e-16 ***
## PC3         -0.003016   0.007439  -0.405   0.6853    
## PC4          0.051006   0.007836   6.509 1.61e-10 ***
## PC5          0.079890   0.009573   8.345 4.97e-16 ***
## PC6          0.024388   0.010587   2.304   0.0216 *  
## PC7          0.050646   0.010959   4.622 4.67e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2758 on 595 degrees of freedom
## Multiple R-squared:  0.8808, Adjusted R-squared:  0.8794 
## F-statistic: 628.1 on 7 and 595 DF,  p-value: < 2.2e-16
pr_lm(mec_statOM2, mec_statOM,"summary")
## 
## Call:
## lm(formula = log(PriceEURO) ~ ., data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.98362 -0.15319  0.00409  0.17384  0.87550 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.873718   0.011253 966.280  < 2e-16 ***
## PC1          0.249612   0.004098  60.912  < 2e-16 ***
## PC2          0.104470   0.005802  18.006  < 2e-16 ***
## PC3         -0.060138   0.007678  -7.832 2.23e-14 ***
## PC4         -0.002090   0.008060  -0.259  0.79550    
## PC5          0.072024   0.009444   7.627 9.66e-14 ***
## PC6         -0.029007   0.010415  -2.785  0.00552 ** 
## PC7          0.014708   0.010566   1.392  0.16446    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2756 on 592 degrees of freedom
## Multiple R-squared:  0.8755, Adjusted R-squared:  0.874 
## F-statistic: 594.8 on 7 and 592 DF,  p-value: < 2.2e-16
pr_lm(mec_statSA2, mec_statSA,"summary")
## 
## Call:
## lm(formula = log(PriceEURO) ~ ., data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.22289 -0.19099  0.00106  0.17930  2.31963 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.859775   0.013900 781.296  < 2e-16 ***
## PC1          0.257093   0.005087  50.536  < 2e-16 ***
## PC2          0.098398   0.006839  14.387  < 2e-16 ***
## PC3         -0.053020   0.008951  -5.923 5.53e-09 ***
## PC4         -0.064143   0.010956  -5.855 8.17e-09 ***
## PC5         -0.003164   0.011570  -0.273    0.785    
## PC6         -0.068201   0.012374  -5.512 5.44e-08 ***
## PC7          0.057317   0.013740   4.172 3.51e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3304 on 557 degrees of freedom
## Multiple R-squared:  0.8379, Adjusted R-squared:  0.8358 
## F-statistic: 411.2 on 7 and 557 DF,  p-value: < 2.2e-16
pr_lm(mec_statUAE2, mec_statUAE,"summary")
## 
## Call:
## lm(formula = log(PriceEURO) ~ ., data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02425 -0.17093  0.00908  0.18996  0.90457 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.760563   0.010930 984.463  < 2e-16 ***
## PC1          0.261156   0.003970  65.785  < 2e-16 ***
## PC2          0.111463   0.005655  19.711  < 2e-16 ***
## PC3         -0.045508   0.007398  -6.151 1.28e-09 ***
## PC4          0.032803   0.007847   4.180 3.27e-05 ***
## PC5          0.087480   0.009194   9.515  < 2e-16 ***
## PC6         -0.006546   0.009883  -0.662    0.508    
## PC7          0.066866   0.010647   6.280 5.87e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2939 on 715 degrees of freedom
## Multiple R-squared:  0.8727, Adjusted R-squared:  0.8715 
## F-statistic: 700.3 on 7 and 715 DF,  p-value: < 2.2e-16
R2pr_lm<- c(pr_lm(mec_statBR2, mec_statBR),
pr_lm(mec_statKW2, mec_statKW),
pr_lm(mec_statQT2, mec_statQT),
pr_lm(mec_statOM2, mec_statOM),
pr_lm(mec_statSA2, mec_statSA),
pr_lm(mec_statUAE2, mec_statUAE))
R2pr_lm
## [1] 0.8709772 0.8297523 0.8581698 0.8348107 0.8206330 0.8653526
mec_map5 <- data.frame(region=c("Bahrain", "Kuwait", "Qatar", "Oman", "Saudi Arabia", "United Arab Emirates"),
                       R2pc_lm=R2pr_lm)

mapdata<-map_data("world")
mapdata <- left_join(mapdata, mec_map5, by="region") 
mapdata<-mapdata %>% filter(!is.na(mapdata$R2pc_lm))

ggplot(mapdata, aes(x=long, y=lat, group=group))+
  geom_polygon(aes(fill=R2pc_lm), color="black")+
  scale_fill_gradient(name="R² for the linear model run on\nthe significant PCs\n", low="#CCFFFF", high="#000066",
                      limit=c(.82, .87),
                      breaks=seq(from=.82,
                                 to=.87,
                            by=.01))+
  
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        rect = element_blank())

R2df$R2pca<-R2pr_lm

R2df <- R2df %>% select("region", "R2Tree", "R2rr", "R2pca", "R2rf", "R2best_Tr_rr")
R2df
##                 region    R2Tree      R2rr     R2pca      R2rf
## 1              Bahrain 0.8180443 0.8886621 0.8709772 0.9458579
## 2               Kuwait 0.7998163 0.8531936 0.8297523 0.9260215
## 3                 Oman 0.8358280 0.8713075 0.8581698 0.9389488
## 4                Qatar 0.8542003 0.8709802 0.8348107 0.9434955
## 5         Saudi Arabia 0.7114432 0.8458653 0.8206330 0.9077597
## 6 United Arab Emirates 0.8046047 0.8914368 0.8653526 0.9359294
##        R2best_Tr_rr
## 1 Robust Regression
## 2 Robust Regression
## 3 Robust Regression
## 4 Robust Regression
## 5 Robust Regression
## 6 Robust Regression
pr_lm(mec_statBR2, mec_statBR,"rf")
## 
## The best nr of mtry for min error is:  4
## The best nr of trees for min error is:  500
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.04532261
##                     % Var explained: 92.58

## Mean squared error:  8.394225e-05
## R²:  0.9216441
pr_lm(mec_statKW2, mec_statKW,"rf")
## 
## The best nr of mtry for min error is:  4
## The best nr of trees for min error is:  400
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 400
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.05230703
##                     % Var explained: 91.4

## Mean squared error:  0.0002257347
## R²:  0.9108987
pr_lm(mec_statQT2, mec_statQT,"rf")
## 
## The best nr of mtry for min error is:  4
## The best nr of trees for min error is:  500
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.05767763
##                     % Var explained: 90.69

## Mean squared error:  4.920689e-05
## R²:  0.9224049
pr_lm(mec_statOM2, mec_statOM,"rf")
## 
## The best nr of mtry for min error is:  4
## The best nr of trees for min error is:  500
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.05116926
##                     % Var explained: 91.57

## Mean squared error:  0.000670271
## R²:  0.9179071
pr_lm(mec_statSA2, mec_statSA,"rf")
## 
## The best nr of mtry for min error is:  4
## The best nr of trees for min error is:  450
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 450
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.0658544
##                     % Var explained: 89.83

## Mean squared error:  6.453369e-05
## R²:  0.8802514
pr_lm(mec_statUAE2, mec_statUAE,"rf")
## 
## The best nr of mtry for min error is:  4
## The best nr of trees for min error is:  400
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 400
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.05518666
##                     % Var explained: 91.86

## Mean squared error:  5.156505e-05
## R²:  0.9061594

Analysis of the entire dataset!

mec_stat %>% select(-Horsepower, -Engine_Capacity) %>% tree_constructor()

## 
## The MSE is: 0.0002739199 
## 
## R²:  0.8426183 
##                   model1_pr.variable.importance
## Torque                               1637.17940
## Acceleration_0100                    1105.03200
## Top_Speed                            1012.32237
## Cylinders                             855.20211
## Wheelbase                             650.83485
## Fuel_Capacity                         641.48866
## Width                                 123.21335
## Length                                 35.35770
## Height                                 25.93947
## Trunk_Capacity                         22.12323
## Liters_For_100km                       22.09052
## [1] 0.8426183

I created a new function so that I re inserted the variable Area in the dataset. Not the smartest thing to do but it works.

rob_reg1<-function(ds){
  set.seed(1)
  index <- sample(2, nrow(ds), prob = c(0.8, 0.2), replace = TRUE) 
  #so we give to each observation a prob of 80% of falling into the train and 20% of falling into the test
  
  PriceEURO<-ds$PriceEURO
  Area_<-ds$Area
  
  
  # difference with the rob_reg function !!
  ds<-ds %>% select(where(is.numeric), -PriceEURO) %>% scale() %>% data.frame() %>% cbind(PriceEURO)
  ds<- ds %>% cbind(Area_)
  
  Train <- ds[index==1, ] # Train data
  Test <- ds[index == 2, ] #Test data
  model1<- Train %>% lmrob(log(PriceEURO)~.,.)
  predicted<-predict(model1, newdata=Test)
  cat("\nR²: ", 
      1-(
        (sum((log(Test$PriceEURO)-predicted)^2))/
          (sum((log(Test$PriceEURO)-mean(log(Test$PriceEURO)))^2))),"\n")
  model1 %>% summary() %>% print()
  model1$coefficients %>% sort(decreasing = TRUE)
}
mec_stat %>% select(-Horsepower, -Engine_Capacity) %>% rob_reg1()
## 
## R²:  0.8744916 
## 
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
##  \--> method = "MM"
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.2990007 -0.1589043 -0.0004993  0.1564809  2.4769161 
## 
## Coefficients:
##                    Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)       10.770095   0.008705 1237.285  < 2e-16 ***
## Cylinders          0.088369   0.012959    6.819 1.09e-11 ***
## Fuel_Capacity      0.114334   0.010999   10.395  < 2e-16 ***
## Liters_For_100km  -0.033188   0.012012   -2.763  0.00576 ** 
## Torque             0.246575   0.015042   16.392  < 2e-16 ***
## Top_Speed          0.242983   0.011813   20.568  < 2e-16 ***
## Seating_Capacity  -0.104550   0.009298  -11.244  < 2e-16 ***
## Acceleration_0100 -0.133719   0.010904  -12.264  < 2e-16 ***
## Length            -0.019901   0.014475   -1.375  0.16929    
## Width              0.042182   0.005506    7.661 2.42e-14 ***
## Height             0.085787   0.009158    9.367  < 2e-16 ***
## Wheelbase          0.108329   0.013731    7.890 4.12e-15 ***
## Trunk_Capacity    -0.040947   0.006344   -6.454 1.25e-10 ***
## Area_Kuwait       -0.014420   0.014269   -1.011  0.31228    
## Area_Oman          0.026261   0.013936    1.884  0.05961 .  
## Area_Qatar         0.019414   0.013896    1.397  0.16248    
## Area_Saudi Arabia  0.126977   0.015734    8.070 9.80e-16 ***
## Area_UAE           0.062228   0.014477    4.298 1.77e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Robust residual standard error: 0.2302 
## Multiple R-squared:  0.909,  Adjusted R-squared:  0.9085 
## Convergence in 14 IRWLS iterations
## 
## Robustness weights: 
##  4 observations c(21,936,1355,1495) are outliers with |weight| = 0 ( < 3.1e-05); 
##  257 weights are ~= 1. The remaining 2973 ones are summarized as
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0181  0.8621  0.9507  0.8871  0.9853  0.9990 
## Algorithmic parameters: 
##        tuning.chi                bb        tuning.psi        refine.tol 
##         1.548e+00         5.000e-01         4.685e+00         1.000e-07 
##           rel.tol         scale.tol         solve.tol          zero.tol 
##         1.000e-07         1.000e-10         1.000e-07         1.000e-10 
##       eps.outlier             eps.x warn.limit.reject warn.limit.meanrw 
##         3.092e-05         8.117e-12         5.000e-01         5.000e-01 
##      nResample         max.it         groups        n.group       best.r.s 
##            500             50              5            400              2 
##       k.fast.s          k.max    maxit.scale      trace.lev            mts 
##              1            200            200              0           1000 
##     compute.rd fast.s.large.n 
##              0           2000 
##                   psi           subsampling                   cov 
##            "bisquare"         "nonsingular"         ".vcov.avar1" 
## compute.outlier.stats 
##                  "SM" 
## seed : int(0)
##       (Intercept)            Torque         Top_Speed Area_Saudi Arabia 
##       10.77009494        0.24657548        0.24298325        0.12697726 
##     Fuel_Capacity         Wheelbase         Cylinders            Height 
##        0.11433437        0.10832881        0.08836917        0.08578749 
##          Area_UAE             Width         Area_Oman        Area_Qatar 
##        0.06222784        0.04218167        0.02626120        0.01941357 
##       Area_Kuwait            Length  Liters_For_100km    Trunk_Capacity 
##       -0.01442017       -0.01990056       -0.03318842       -0.04094695 
##  Seating_Capacity Acceleration_0100 
##       -0.10454984       -0.13371887
mec_stat %>% select(-Horsepower, -Engine_Capacity) %>% rf_model(., "Big dataset")
## 
## The best nr of mtry for min error is:  4
## The best nr of trees for min error is:  50
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 50
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.01995721
##                     % Var explained: 96.85

## Mean squared error:  8.83196e-05
## R²:  0.9696733
pcBR<- cbind(mec_statBR2, mec_statBR$PriceEURO)
pcBR$Area<-0
pcBR<-pcBR %>% rename("PriceEURO"="mec_statBR$PriceEURO")


pcKW<-cbind(mec_statKW2, mec_statKW$PriceEURO)
pcKW$Area<-1
pcKW<-pcKW %>% rename("PriceEURO"="mec_statKW$PriceEURO")

pcQT<-cbind(mec_statQT2, mec_statQT$PriceEURO)
pcQT$Area<-2
pcQT<-pcQT %>% rename("PriceEURO"="mec_statQT$PriceEURO")

pcOM<-cbind(mec_statOM2, mec_statOM$PriceEURO)
pcOM$Area<-3
pcOM<-pcOM %>% rename("PriceEURO"="mec_statOM$PriceEURO")

pcSA<-cbind(mec_statSA2, mec_statSA$PriceEURO)
pcSA$Area<-4
pcSA<-pcSA %>% rename("PriceEURO"="mec_statSA$PriceEURO")

pcUAE<-cbind(mec_statUAE2, mec_statUAE$PriceEURO)
pcUAE$Area<-5
pcUAE<-pcUAE %>% rename("PriceEURO"="mec_statUAE$PriceEURO")

mec_statPC <- rbind(pcBR, pcKW, pcQT, pcOM,pcSA,pcUAE)
mec_statPC %>% pr_lm(.,0,request="default")
## [1] 0.8605758
mec_statPC %>% pr_lm(.,0,request="tree")

## 
## The MSE is: 2.870222e-05 
## 
## R²:  0.8008268 
##     model1_pr.variable.importance
## PC1                    1532.48062
## PC3                     298.94810
## PC4                     279.02838
## PC2                     201.41527
## PC5                     185.15232
## PC6                     129.45372
## PC7                      90.63286
## [1] 0.8008268
mec_statPC %>% pr_lm(.,0,request="rf")
## 
## The best nr of mtry for min error is:  4
## The best nr of trees for min error is:  400
## Call:
##  randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree,      mtry = best_mtry_model$bestTune[1, 1], importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 400
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.02866165
##                     % Var explained: 95.48

## Mean squared error:  3.447948e-06
## R²:  0.959543
pca(mec_statPC %>% select(-Horsepower, - Engine_Capacity, -PriceEURO), "General model")

mec_statPC %>% select(-Horsepower, - Engine_Capacity, -PriceEURO) %>% prcomp(scale. = TRUE, center = TRUE) %>% fviz_eig(choice = "variance",addlabels=TRUE,xlab = "Principal components",
                  main = "" ,ylab = "",ncp=7,barfill = "deepskyblue", linecolor = "blue") + theme_minimal()